Load libraries
library(lubridate)
library(ggplot2)
library(dplyr)
library(tidytext)
library(quanteda)
library(scales)
library(LSX)
Load the data.
# Read the serialized object at data/data_wel_nouns_sample. Later code uses it
# as a data frame with (at least) text, Body, Date and Newspaper columns.
data_wel <- readRDS(file = "data/data_wel_nouns_sample")
# Scaling
Load the Korean sentiment lexicon.
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## term = col_character()
## )
##
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## term = col_character()
## )
See the top sentiment words.
# Reshape the corpus to one token per row: each entry of `text` is split into
# tokens stored in a new `word` column.
tidy_news <- data_wel %>%
  unnest_tokens(word, text)

# Count how often each lexicon word occurs in the corpus, most frequent first.
# The join key is spelled out so the match does not silently depend on
# whichever column names the two tables happen to share.
# `senti` is the sentiment lexicon loaded earlier in the document.
tidy_news %>%
  inner_join(senti, by = "word") %>%
  count(word, sort = TRUE)
# Net sentiment per newspaper and presidential party: count positive and
# negative lexicon hits, spread them into two columns (0 where a combination
# has no hits) and take the difference.
senti_news <- tidy_news %>%
  inner_join(senti, by = "word") %>%
  count(Newspaper, Prezparty, sentiment) %>%
  # pivot_wider() replaces the superseded spread(); names_sort keeps the
  # alphabetical column order that spread() produced.
  tidyr::pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0,
    names_sort = TRUE
  ) %>%
  mutate(sentiment = positive - negative)
# Lexicon-word frequencies for the "1990-1993 Roh TW" government only, sorted
# from most to least frequent. The join key is explicit rather than inferred.
senti_word_counts <- tidy_news %>%
  inner_join(senti, by = "word") %>%
  filter(Government == "1990-1993 Roh TW") %>%
  count(word, sentiment, Government, sort = TRUE) %>%
  ungroup()

senti_word_counts
Plot the top sentiment words.
# Bar chart of the 25 most frequent words per sentiment class (ties kept),
# one facet per class, bars ordered by frequency.
senti_word_counts %>%
  group_by(sentiment) %>%
  # slice_max() names the ranking column explicitly; top_n(25) is superseded
  # and picked the last column (n) implicitly.
  slice_max(n, n = 25) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip() +
  scale_fill_grey() +
  theme_bw()

#Sys.setlocale(locale = "C")
# Seed dictionary with two keys, `positive` and `negative`, each holding six
# Korean seed words.
dict_sentiment <- dictionary(
  x = list(
    positive = c("안정", "개선", "강화", "보장", "확보", "참여"),
    negative = c("부담", "투쟁", "논란", "포퓰리즘", "반발", "논쟁")
  )
)
dict_sentiment
## Dictionary object with 2 key entries.
## - [positive]:
## - 안정, 개선, 강화, 보장, 확보, 참여
## - [negative]:
## - 부담, 투쟁, 논란, 포퓰리즘, 반발, 논쟁
# Build a sentence-level corpus: drop duplicated rows, strip HTML-like tags and
# the "▲ 종이신문보기" marker from Body, convert to a corpus, then reshape so
# that every sentence becomes its own document.
corp_sent <- data_wel %>%
  unique() %>%
  mutate(
    Body = gsub(pattern = "</?[^>]+>|▲ 종이신문보기", replacement = "", x = Body)
  ) %>%
  corpus() %>%
  corpus_reshape(to = "sentences")

# Tokenize the sentences using the tokenizer's default settings.
toks <- tokens(corp_sent)
# Create a document-feature matrix from the tokens object, drop the
# empty-string feature, and keep only features with a total frequency of at
# least 5.
# dfm(remove = ...) is deprecated in quanteda 3; dfm_remove() is the supported
# way to drop features after the matrix is built.
dfmat <- toks %>%
  dfm() %>%
  dfm_remove(pattern = "") %>%
  dfm_trim(min_termfreq = 5)

# Show the 20 most frequent features.
topfeatures(dfmat, 20)
## 복지 사회 정부 경제 우리 정책 지원 지역 문제 한국 국민
## 109966 74597 62288 48535 44904 42000 41518 39668 39080 38377 37989
## 교육 사람 기자 서울 때문 의원 장애 사업 대통령
## 37510 36415 35667 34988 33351 32042 31837 30952 30112
# Turn the seed dictionary into a named numeric vector of seed-word polarities
# (the printed result shows 1 for positive and -1 for negative entries).
seed <- dict_sentiment %>%
  as.seedwords()
seed
## 안정 개선 강화 보장 확보 참여 부담 투쟁
## 1 1 1 1 1 1 -1 -1
## 논란 포퓰리즘 반발 논쟁
## -1 -1 -1 -1
# Select context words around tokens matching the glob "*복지*" (p = 0.05).
context_terms <- toks %>%
  char_context(pattern = "*복지*", p = 0.05)

# Fit the LSS model on the document-feature matrix with the seed words,
# restricted to the context terms; k = 300, with caching switched on.
tmod_lss <- dfmat %>%
  textmodel_lss(
    seeds = seed,
    terms = context_terms,
    k = 300,
    cache = TRUE
  )
## Writing cache file: lss_cache/svds_1cd9885e9eae5c8c.RDS
Look up key periods
#Sys.setlocale(locale = "Korean")
# Show the Body of every row whose `text` contains "로타".
data_wel %>%
  filter(grepl(pattern = "로타", x = text)) %>%
  select(Body)
# First 20 model coefficients: the most positive words.
tmod_lss %>%
  coef() %>%
  head(20)
## 강화 안정 확충 개선 농어 통한 내실화 보장
## 0.3555332 0.3334762 0.3271113 0.3120294 0.3100376 0.2961158 0.2922238 0.2869180
## 확보 업인 구축 구현 기반 향상 우선 중점
## 0.2636897 0.2570059 0.2565135 0.2553314 0.2290634 0.2154659 0.2152102 0.2130866
## 여건 역점 처우 시책
## 0.2128337 0.2114164 0.2109081 0.2022648
# Last 20 model coefficients: the most negative words.
tmod_lss %>%
  coef() %>%
  tail(20)
## 부가 부자 감세 부담 밥그릇 무상급식 등록금
## -0.1604088 -0.1620336 -0.1639986 -0.1683063 -0.1690423 -0.1743405 -0.1808749
## 무상 이슈 반값 재정부 수십조 철회 공짜
## -0.1858070 -0.1916298 -0.1943748 -0.1946259 -0.1970140 -0.2114392 -0.2188315
## 영합주의 당론 포퓰리즘 반발 논쟁 논란
## -0.2572511 -0.2656321 -0.2885387 -0.3006878 -0.3331700 -0.3710353
# Plot the model's terms, passing the negative and positive seed-dictionary
# entries as the second argument.
tmod_lss %>%
  textplot_terms(dict_sentiment[c("negative", "positive")])

# ggsave("plots/6_wel_seeds.jpg", width=8, height= 5, dpi = 300)
# Regroup the sentence-level matrix with dfm_group()'s default grouping.
# NOTE(review): presumably this re-aggregates sentences to one row per source
# article; confirm against the installed quanteda version.
dfmat <- dfm_group(dfmat)

# Predict sentiment scores (with standard errors) for the grouped documents
# and attach each document's date and newspaper.
pred <- as.data.frame(predict(tmod_lss, se.fit = TRUE, newdata = dfmat))
pred$date <- docvars(dfmat, "Date")
pred$Newspaper <- docvars(dfmat, "Newspaper")

# Scatter of raw document scores over time. The y-axis label now names the
# sentiment scale, matching the trend plot; "Threat Intensity" was a leftover
# from a different analysis.
plot(pred$date, pred$fit, col = rgb(0, 0, 0, 0.05),
     pch = 20, ylim = c(-1, 1),
     ylab = "Negative vs. Positive")

# Smooth the document-level scores separately for each newspaper, using the
# "locfit" engine of smooth_lss().
pred_sm_chos <- smooth_lss(
  filter(pred, Newspaper == "Chosun"),
  engine = "locfit"
)
pred_sm_hani <- smooth_lss(
  filter(pred, Newspaper == "Hankyoreh"),
  engine = "locfit"
)
pred_sm_hankook <- smooth_lss(
  filter(pred, Newspaper == "Hankook"),
  engine = "locfit"
)
#Sys.setlocale(locale = "Korean")
# Preview the first rows of each smoothed series.
head(pred_sm_chos)
head(pred_sm_hani)
head(pred_sm_hankook)
Plot the trend.
# Smoothed sentiment trend for Chosun and Hankyoreh, with a shaded band for the
# period labelled "Liberal governments" and two numbered event markers.
# NOTE(review): pred_sm_hankook is computed above but not plotted here; confirm
# that leaving it out is intended.
x <- bind_rows("Chosun" = pred_sm_chos, "Hankyoreh" = pred_sm_hani, .id = "Newspaper") %>%
  # "ymd" is not a strptime format string; plain as.Date() leaves Date values
  # unchanged and parses ISO-formatted strings correctly.
  mutate(date = as.Date(date)) %>%
  mutate(Newspaper = as.factor(Newspaper)) %>%
  ggplot(aes(date, fit, group = Newspaper, color = Newspaper, fill = Newspaper)) +
  # Shaded band from 1998-02-25 to 2008-02-25. The original end date read
  # "2008-02-2", which looks like a truncated "2008-02-25".
  # NOTE(review): confirm the intended end date.
  annotate("rect", xmin = as.Date("1998-02-25"), xmax = as.Date("2008-02-25"),
           ymin = -Inf, ymax = Inf, alpha = 0.2) +
  geom_line() +
  # Band of one standard error around the fit (lower bound = ymin, upper = ymax).
  geom_ribbon(aes(ymin = fit - se.fit, ymax = fit + se.fit), alpha = 0.3) +
  geom_hline(yintercept = 0) +
  scale_color_manual(values = c("grey20", "grey70")) +
  theme_bw() +
  labs(x = "Year", y = "Negative vs. Positive",
       caption = "1. 2002 South Korean local elections\n2. Seoul City's anti-free lunch referendum initiated by mayor (conservative) over free welfare controversies") +
  scale_x_date(breaks = "1 year", labels = date_format("%Y"),
               limits = as.Date(c("1990-01-01", "2014-12-31")), expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "top") +
  scale_fill_manual(values = c("grey20", "grey70")) +
  # Period labels along the top of the panel.
  annotate(geom = "text", x = as.Date("1994-01-01"), y = 0.85,
           label = "Conservative\ngovernments", hjust = "center") +
  annotate(geom = "text", x = as.Date("2003-01-01"), y = 0.85,
           label = "Liberal\ngovernments", hjust = "center") +
  annotate(geom = "text", x = as.Date("2012-01-01"), y = 0.85,
           label = "Conservative\ngovernments", hjust = "center") +
  # Numbered event markers referenced in the caption, each with a dotted
  # vertical guide line below it.
  annotate(geom = "text", x = as.Date("2002-06-13"), y = 0.6,
           label = "1", hjust = "center", size = 4) +
  annotate(geom = "text", x = as.Date("2011-08-24"), y = 0.6,
           label = "2", hjust = "center", size = 4) +
  annotate("segment", x = as.Date(c("2002-06-13", "2011-08-24")),
           xend = as.Date(c("2002-06-13", "2011-08-24")),
           y = -Inf, yend = 0.55, linetype = 3)
x

# ggsave("plots/6_wel_lss.jpg", width=9, height= 5, dpi = 300)
Look up key periods
# Show the rows dated strictly between 2007-01-01 and 2008-01-31, with HTML-like
# tags and the "▲ 종이신문보기" marker stripped from Body.
# The bounds are explicit, well-formed Date values, so the comparison does not
# rely on implicit coercion of the malformed literal "2007-1-01".
data_wel %>%
  filter(Date > as.Date("2007-01-01"), Date < as.Date("2008-01-31")) %>%
  mutate(Body = gsub("</?[^>]+>|▲ 종이신문보기", "", Body))